/**
 * IK 中文分词  版本 5.0
 * IK Analyzer release 5.0
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * 源代码由林良益(linliangyi2005@gmail.com)提供
 * 版权声明 2012，乌龙茶工作室
 * provided by Linliangyi and copyright 2012 by Oolong studio
 *
 * 字符集识别工具类
 */
package org.wltea.analyzer.core;

/**
 * Character classification and normalization helpers.
 *
 * <p>Provides the character-type constants returned by
 * {@link #identifyCharType(char)} and the {@link #regularize(char)}
 * normalization routine (fullwidth to halfwidth, uppercase to lowercase).
 */
class CharacterUtil {

	/** Character type: anything not recognized by {@link #identifyCharType(char)}. */
	public static final int CHAR_USELESS = 0;

	/** Character type: ASCII digit {@code '0'..'9'}. */
	public static final int CHAR_ARABIC = 0x00000001;

	/** Character type: ASCII letter {@code 'a'..'z'} or {@code 'A'..'Z'}. */
	public static final int CHAR_ENGLISH = 0x00000002;

	/** Character type: CJK ideograph (unified, compatibility, or extension A). */
	public static final int CHAR_CHINESE = 0x00000004;

	/** Character type: Hangul, kana, or halfwidth/fullwidth forms. */
	public static final int CHAR_OTHER_CJK = 0x00000008;

	/** Ideographic (fullwidth) space, U+3000. */
	private static final char IDEOGRAPHIC_SPACE = 0x3000;

	/** First fullwidth counterpart of a printable ASCII character, U+FF01. */
	private static final char FULLWIDTH_ASCII_FIRST = 0xFF01;

	/** Last fullwidth counterpart of a printable ASCII character, U+FF5E. */
	private static final char FULLWIDTH_ASCII_LAST = 0xFF5E;

	/** Distance between a fullwidth ASCII variant and its halfwidth form. */
	private static final int FULLWIDTH_ASCII_OFFSET = 0xFEE0;

	/**
	 * Identifies the type of a character.
	 *
	 * <p>ASCII digits and letters are matched directly; every other character
	 * is classified by its Unicode block. ASCII punctuation, whitespace and
	 * control characters are not ideographs and therefore yield
	 * {@link #CHAR_USELESS}.
	 *
	 * @param input the character to classify
	 * @return one of the {@code CHAR_*} constants defined by this class
	 */
	static int identifyCharType(char input) {
		if (input >= '0' && input <= '9') {
			return CHAR_ARABIC;
		}
		if ((input >= 'a' && input <= 'z') || (input >= 'A' && input <= 'Z')) {
			return CHAR_ENGLISH;
		}

		Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);

		// Chinese ideograph blocks. BASIC_LATIN is deliberately not listed:
		// after the digit/letter checks above it would only match ASCII
		// punctuation, whitespace and control characters.
		if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
				|| ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
				|| ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) {
			return CHAR_CHINESE;
		}

		if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS // fullwidth digits/letters and halfwidth kana/Hangul
				// Korean
				|| ub == Character.UnicodeBlock.HANGUL_SYLLABLES
				|| ub == Character.UnicodeBlock.HANGUL_JAMO
				|| ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
				// Japanese
				|| ub == Character.UnicodeBlock.HIRAGANA
				|| ub == Character.UnicodeBlock.KATAKANA
				|| ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) {
			return CHAR_OTHER_CJK;
		}

		// Everything else is not processed.
		return CHAR_USELESS;
	}

	/**
	 * Normalizes a character: converts fullwidth forms to halfwidth and
	 * uppercase ASCII letters to lowercase.
	 *
	 * <p>The ideographic space (U+3000) becomes an ASCII space, and the
	 * fullwidth range U+FF01..U+FF5E is shifted down to {@code '!'..'~'}.
	 * Case folding is applied after the width conversion so that fullwidth
	 * uppercase letters end up lowercase as well.
	 *
	 * @param input the character to normalize
	 * @return the normalized character; characters outside the ranges above
	 *         are returned unchanged
	 */
	static char regularize(char input) {
		if (input == IDEOGRAPHIC_SPACE) {
			return ' ';
		}

		if (input >= FULLWIDTH_ASCII_FIRST && input <= FULLWIDTH_ASCII_LAST) {
			input = (char) (input - FULLWIDTH_ASCII_OFFSET);
		}

		// Not an else-branch: a narrowed fullwidth letter must be folded too.
		if (input >= 'A' && input <= 'Z') {
			input += 32;
		}

		return input;
	}
}
